In [7]:
library(MASS)

In [8]:
# install.packages('/notebooks/ISLR_1.0.tar.gz')

In [9]:
library(ISLR) # library from the book

Simple linear regression


In [10]:
names(Boston) # housing values in suburbs of Boston; part of MASS library


Out[10]:
  1. 'crim'
  2. 'zn'
  3. 'indus'
  4. 'chas'
  5. 'nox'
  6. 'rm'
  7. 'age'
  8. 'dis'
  9. 'rad'
  10. 'tax'
  11. 'ptratio'
  12. 'black'
  13. 'lstat'
  14. 'medv'

In [11]:
?Boston


Out[11]:
Boston {MASS}R Documentation

Housing Values in Suburbs of Boston

Description

The Boston data frame has 506 rows and 14 columns.

Usage

Boston

Format

This data frame contains the following columns:

crim

per capita crime rate by town.

zn

proportion of residential land zoned for lots over 25,000 sq.ft.

indus

proportion of non-retail business acres per town.

chas

Charles River dummy variable (= 1 if tract bounds river; 0 otherwise).

nox

nitrogen oxides concentration (parts per 10 million).

rm

average number of rooms per dwelling.

age

proportion of owner-occupied units built prior to 1940.

dis

weighted mean of distances to five Boston employment centres.

rad

index of accessibility to radial highways.

tax

full-value property-tax rate per \$10,000.

ptratio

pupil-teacher ratio by town.

black

1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town.

lstat

lower status of the population (percent).

medv

median value of owner-occupied homes in \$1000s.

Source

Harrison, D. and Rubinfeld, D.L. (1978) Hedonic prices and the demand for clean air. J. Environ. Economics and Management 5, 81–102.

Belsley D.A., Kuh, E. and Welsch, R.E. (1980) Regression Diagnostics. Identifying Influential Data and Sources of Collinearity. New York: Wiley.


[Package MASS version 7.3-44 ]

In [12]:
# lstat: lower status of the population (percent)
# medv: median value of owner-occupied homes in $1000s

par(mfrow = c(1,2)) # show the two plots side by side
plot(Boston$lstat, Boston$medv)
# different notation, same result: formula interface plots Y~X,
# looking the variables up in the Boston data frame
plot(medv~lstat, Boston)



In [13]:
# Simple linear regression: model the response medv (median home value)
# on the single predictor lstat (% lower-status population).
# Reads as: medv is modelled as a linear function of lstat.
fit1 <- lm(medv ~ lstat, data = Boston)

In [14]:
fit1 # there's a negative correlation


Out[14]:
Call:
lm(formula = medv ~ lstat, data = Boston)

Coefficients:
(Intercept)        lstat  
      34.55        -0.95  

In [15]:
# as the % of 'lower status' inhabitants increases,
# the median housing value decreases
summary(fit1) # both coefficients are highly significant (p < 2e-16)


Out[15]:
Call:
lm(formula = medv ~ lstat, data = Boston)

Residuals:
    Min      1Q  Median      3Q     Max 
-15.168  -3.990  -1.318   2.034  24.500 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 34.55384    0.56263   61.41   <2e-16 ***
lstat       -0.95005    0.03873  -24.53   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 6.216 on 504 degrees of freedom
Multiple R-squared:  0.5441,	Adjusted R-squared:  0.5432 
F-statistic: 601.6 on 1 and 504 DF,  p-value: < 2.2e-16

In [16]:
plot(medv~lstat, Boston)
abline(fit1, col="red") # abline: draw a straight line; here the fitted regression line



In [17]:
names(fit1)


Out[17]:
  1. 'coefficients'
  2. 'residuals'
  3. 'effects'
  4. 'rank'
  5. 'fitted.values'
  6. 'assign'
  7. 'qr'
  8. 'df.residual'
  9. 'xlevels'
  10. 'call'
  11. 'terms'
  12. 'model'

In [18]:
# 95% confidence intervals for the coefficients of the fit
confint(fit1)


Out[18]:
2.5 %97.5 %
(Intercept)33.4484635.65922
lstat-1.0261482-0.8739505

In [19]:
# predict() can be used to query a model fit
# here: predict medv (and confidence intervals) given 3 new values for lstat
predict(fit1, data.frame(lstat=c(5,10,15)), interval="confidence")
# returns the fitted value at the 3 inputs, plus lower/upper confidence bounds


Out[19]:
fitlwrupr
129.8035929.0074130.59978
225.0533524.4741325.63256
320.3031019.7315920.87461

In [20]:
# plot(fit1) # gives lots of plots / stat summaries

Multiple linear regression


In [21]:
# Multiple regression: model medv on lstat and age together.
fit2 <- lm(medv ~ lstat + age, data = Boston)
summary(fit2)
# age is also a significant predictor for medv,
# but not as strong as lstat

# multiple R-squared (% of variance explained; higher is better)


Out[21]:
Call:
lm(formula = medv ~ lstat + age, data = Boston)

Residuals:
    Min      1Q  Median      3Q     Max 
-15.981  -3.978  -1.283   1.968  23.158 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 33.22276    0.73085  45.458  < 2e-16 ***
lstat       -1.03207    0.04819 -21.416  < 2e-16 ***
age          0.03454    0.01223   2.826  0.00491 ** 
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 6.173 on 503 degrees of freedom
Multiple R-squared:  0.5513,	Adjusted R-squared:  0.5495 
F-statistic:   309 on 2 and 503 DF,  p-value: < 2.2e-16

In [22]:
# Use all variables in Boston (the `.` shorthand, except medv itself)
# to predict medv.
fit3 <- lm(medv ~ ., data = Boston)
summary(fit3)
# age: was significant when paired only with lstat,
# but with all other predictors present it is no longer significant


Out[22]:
Call:
lm(formula = medv ~ ., data = Boston)

Residuals:
    Min      1Q  Median      3Q     Max 
-15.595  -2.730  -0.518   1.777  26.199 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept)  3.646e+01  5.103e+00   7.144 3.28e-12 ***
crim        -1.080e-01  3.286e-02  -3.287 0.001087 ** 
zn           4.642e-02  1.373e-02   3.382 0.000778 ***
indus        2.056e-02  6.150e-02   0.334 0.738288    
chas         2.687e+00  8.616e-01   3.118 0.001925 ** 
nox         -1.777e+01  3.820e+00  -4.651 4.25e-06 ***
rm           3.810e+00  4.179e-01   9.116  < 2e-16 ***
age          6.922e-04  1.321e-02   0.052 0.958229    
dis         -1.476e+00  1.995e-01  -7.398 6.01e-13 ***
rad          3.060e-01  6.635e-02   4.613 5.07e-06 ***
tax         -1.233e-02  3.760e-03  -3.280 0.001112 ** 
ptratio     -9.527e-01  1.308e-01  -7.283 1.31e-12 ***
black        9.312e-03  2.686e-03   3.467 0.000573 ***
lstat       -5.248e-01  5.072e-02 -10.347  < 2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 4.745 on 492 degrees of freedom
Multiple R-squared:  0.7406,	Adjusted R-squared:  0.7338 
F-statistic: 108.1 on 13 and 492 DF,  p-value: < 2.2e-16

In [23]:
par(mfrow = c(2,2)) # 2x2 grid for the four standard lm diagnostic plots
plot(fit3)
# residuals vs. fitted values: we're looking for non-linearity
# we see here, that the model doesn't capture everything that is
# going on



In [24]:
# update(): refit an existing model with a modified formula.
# Same response; keep all predictors from fit3 except age and indus,
# i.e. drop the two predictors that were not significant.
fit4 <- update(fit3, ~ . - age - indus)
# we're keeping only very significant predictors

In [25]:
par(mfrow = c(2,2)) # 2x2 grid of diagnostic plots for the reduced model
plot(fit4)



In [26]:
summary(fit4)


Out[26]:
Call:
lm(formula = medv ~ crim + zn + chas + nox + rm + dis + rad + 
    tax + ptratio + black + lstat, data = Boston)

Residuals:
     Min       1Q   Median       3Q      Max 
-15.5984  -2.7386  -0.5046   1.7273  26.2373 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept)  36.341145   5.067492   7.171 2.73e-12 ***
crim         -0.108413   0.032779  -3.307 0.001010 ** 
zn            0.045845   0.013523   3.390 0.000754 ***
chas          2.718716   0.854240   3.183 0.001551 ** 
nox         -17.376023   3.535243  -4.915 1.21e-06 ***
rm            3.801579   0.406316   9.356  < 2e-16 ***
dis          -1.492711   0.185731  -8.037 6.84e-15 ***
rad           0.299608   0.063402   4.726 3.00e-06 ***
tax          -0.011778   0.003372  -3.493 0.000521 ***
ptratio      -0.946525   0.129066  -7.334 9.24e-13 ***
black         0.009291   0.002674   3.475 0.000557 ***
lstat        -0.522553   0.047424 -11.019  < 2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 4.736 on 494 degrees of freedom
Multiple R-squared:  0.7406,	Adjusted R-squared:  0.7348 
F-statistic: 128.2 on 11 and 494 DF,  p-value: < 2.2e-16

Nonlinear terms and interactions


In [27]:
# Linear model with an interaction between lstat and age:
# lstat*age expands to main effects plus the product term lstat:age.
fit5 <- lm(medv ~ lstat * age, data = Boston)
# there's a somewhat significant interaction betw. lstat and age
# (although age in itself is not significant)

In [28]:
summary(fit5)


Out[28]:
Call:
lm(formula = medv ~ lstat * age, data = Boston)

Residuals:
    Min      1Q  Median      3Q     Max 
-15.806  -4.045  -1.333   2.085  27.552 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept) 36.0885359  1.4698355  24.553  < 2e-16 ***
lstat       -1.3921168  0.1674555  -8.313 8.78e-16 ***
age         -0.0007209  0.0198792  -0.036   0.9711    
lstat:age    0.0041560  0.0018518   2.244   0.0252 *  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 6.149 on 502 degrees of freedom
Multiple R-squared:  0.5557,	Adjusted R-squared:  0.5531 
F-statistic: 209.3 on 3 and 502 DF,  p-value: < 2.2e-16

In [29]:
# Add a quadratic term `lstat^2`. `^` has a special meaning in the
# formula language, so we protect the term with the identity
# function I(), which makes ^ plain arithmetic exponentiation.
fit6 <- lm(medv ~ lstat + I(lstat^2), data = Boston)

In [30]:
summary(fit6)


Out[30]:
Call:
lm(formula = medv ~ lstat + I(lstat^2), data = Boston)

Residuals:
     Min       1Q   Median       3Q      Max 
-15.2834  -3.8313  -0.5295   2.3095  25.4148 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) 42.862007   0.872084   49.15   <2e-16 ***
lstat       -2.332821   0.123803  -18.84   <2e-16 ***
I(lstat^2)   0.043547   0.003745   11.63   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 5.524 on 503 degrees of freedom
Multiple R-squared:  0.6407,	Adjusted R-squared:  0.6393 
F-statistic: 448.5 on 2 and 503 DF,  p-value: < 2.2e-16

In [31]:
attach(Boston) # put Boston vars in local namespace

In [32]:
plot(medv~lstat)
# add the quadratic fit to the plot,
# i.e. for each value of lstat draw the fitted value of the model.
# we can't use abline(), b/c the fit isn't a straight line
points(lstat, fitted(fit6), col="red", pch=20) # pch=20: solid dot



In [33]:
plot(medv~lstat)
# fit medv as a degree-4 polynomial in lstat
# (poly() builds orthogonal polynomial terms)
fit7 <- lm(medv ~ poly(lstat, 4), data = Boston)
points(lstat, fitted(fit7), col="blue", pch=20)
# higher-degree polynomials risk overfitting the data


Available plotting characters


In [34]:
# cex: how much should the plotting symbol be magnified
# e.g. cex.axis=2 increases axis labels
# pch=1:20 displays the first 20 plotting characters, one per point
plot(1:20, 1:20, pch=1:20, cex=2)


Qualitative predictors


In [40]:
# ?Carseats # simul. data set containing sales of child car seats at 400 different stores.
# Carseats comes from the ISLR package loaded at the top of the file.
names(Carseats) # list its 11 columns


Out[40]:
  1. 'Sales'
  2. 'CompPrice'
  3. 'Income'
  4. 'Advertising'
  5. 'Population'
  6. 'Price'
  7. 'ShelveLoc'
  8. 'Age'
  9. 'Education'
  10. 'Urban'
  11. 'US'

In [38]:
summary(Carseats)


Out[38]:
     Sales          CompPrice       Income        Advertising    
 Min.   : 0.000   Min.   : 77   Min.   : 21.00   Min.   : 0.000  
 1st Qu.: 5.390   1st Qu.:115   1st Qu.: 42.75   1st Qu.: 0.000  
 Median : 7.490   Median :125   Median : 69.00   Median : 5.000  
 Mean   : 7.496   Mean   :125   Mean   : 68.66   Mean   : 6.635  
 3rd Qu.: 9.320   3rd Qu.:135   3rd Qu.: 91.00   3rd Qu.:12.000  
 Max.   :16.270   Max.   :175   Max.   :120.00   Max.   :29.000  
   Population        Price        ShelveLoc        Age          Education   
 Min.   : 10.0   Min.   : 24.0   Bad   : 96   Min.   :25.00   Min.   :10.0  
 1st Qu.:139.0   1st Qu.:100.0   Good  : 85   1st Qu.:39.75   1st Qu.:12.0  
 Median :272.0   Median :117.0   Medium:219   Median :54.50   Median :14.0  
 Mean   :264.8   Mean   :115.8                Mean   :53.32   Mean   :13.9  
 3rd Qu.:398.5   3rd Qu.:131.0                3rd Qu.:66.00   3rd Qu.:16.0  
 Max.   :509.0   Max.   :191.0                Max.   :80.00   Max.   :18.0  
 Urban       US     
 No :118   No :142  
 Yes:282   Yes:258  
                    
                    
                    
                    

In [44]:
# Use all predictors in Carseats (except Sales) to predict Sales,
# plus two explicit interaction terms: Income:Advertising and Age:Price.
fit1 <- lm(Sales ~ . + Income:Advertising + Age:Price, data = Carseats)

In [46]:
summary(fit1)
# significant interaction betw. Income and Advertising;
# the Age:Price interaction isn't significant


Out[46]:
Call:
lm(formula = Sales ~ . + Income:Advertising + Age:Price, data = Carseats)

Residuals:
    Min      1Q  Median      3Q     Max 
-2.9208 -0.7503  0.0177  0.6754  3.3413 

Coefficients:
                     Estimate Std. Error t value Pr(>|t|)    
(Intercept)         6.5755654  1.0087470   6.519 2.22e-10 ***
CompPrice           0.0929371  0.0041183  22.567  < 2e-16 ***
Income              0.0108940  0.0026044   4.183 3.57e-05 ***
Advertising         0.0702462  0.0226091   3.107 0.002030 ** 
Population          0.0001592  0.0003679   0.433 0.665330    
Price              -0.1008064  0.0074399 -13.549  < 2e-16 ***
ShelveLocGood       4.8486762  0.1528378  31.724  < 2e-16 ***
ShelveLocMedium     1.9532620  0.1257682  15.531  < 2e-16 ***
Age                -0.0579466  0.0159506  -3.633 0.000318 ***
Education          -0.0208525  0.0196131  -1.063 0.288361    
UrbanYes            0.1401597  0.1124019   1.247 0.213171    
USYes              -0.1575571  0.1489234  -1.058 0.290729    
Income:Advertising  0.0007510  0.0002784   2.698 0.007290 ** 
Price:Age           0.0001068  0.0001333   0.801 0.423812    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 1.011 on 386 degrees of freedom
Multiple R-squared:  0.8761,	Adjusted R-squared:  0.8719 
F-statistic:   210 on 13 and 386 DF,  p-value: < 2.2e-16

In [50]:
#?contrasts # Set and view the contrasts associated with a factor.

In [51]:
contrasts(Carseats$ShelveLoc)
# shows how R will encode that variable in a linear model:
# ShelveLoc is a 3-level factor, so it becomes two dummy columns
# (Good, Medium) with 'Bad' as the baseline level


Out[51]:
GoodMedium
Bad00
Good10
Medium01

Writing R functions

Fit a regression model and make a plot


In [106]:
# Scatter-plot y against x, overlay the least-squares line in red,
# and return the model summary (value of the final expression).
regplot <- function(x, y) {
    model <- lm(y ~ x)
    plot(x, y)
    abline(model, col = "red")
    summary(model)
}

In [105]:
regplot(Carseats$Price, Carseats$Sales)


Out[105]:
Call:
lm(formula = y ~ x)

Coefficients:
(Intercept)            x  
   13.64192     -0.05307  

In [109]:
# ... parameter is like kwargs in Python: extra graphical arguments
# (xlab, col, pch, ...) are forwarded to plot().

# regplot(): scatter-plot y against x, fit a simple linear model,
# draw the fitted line in red, and return the lm fit object.
regplot <- function(x, y, ...) {
    fit <- lm(y ~ x)
    plot(x, y, ...)
    abline(fit, col = "red")
    # the original also called summary(fit) here, but its value was
    # discarded (dead code), so it has been removed
    fit
}

In [108]:
regplot(Carseats$Price, Carseats$Sales, xlab="Price", ylab="Sales", col="blue", pch=20)


Out[108]:
Call:
lm(formula = y ~ x)

Coefficients:
(Intercept)            x  
   13.64192     -0.05307  

In [93]:
# Reverse regression: model Price on Sales. Note the direction of a
# regression matters; this is not the algebraic inverse of Sales ~ Price.
carfit <- lm(Price ~ Sales, data = Carseats)
summary(carfit)


Out[93]:
Call:
lm(formula = Price ~ Sales, data = Carseats)

Residuals:
    Min      1Q  Median      3Q     Max 
-80.851 -15.332   0.528  13.315  57.791 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 143.7589     3.0143  47.692   <2e-16 ***
Sales        -3.7304     0.3763  -9.912   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 21.23 on 398 degrees of freedom
Multiple R-squared:  0.198,	Adjusted R-squared:  0.196 
F-statistic: 98.25 on 1 and 398 DF,  p-value: < 2.2e-16

In [91]:
# regplot() returns the fit, so its stored components (like the call)
# are accessible on the result
testfit = regplot(Carseats$Price, Carseats$Sales)
testfit$call


Out[91]:
lm(formula = y ~ x)

In [95]:
colnames(Carseats)# Carseats$Price


Out[95]:
  1. 'Sales'
  2. 'CompPrice'
  3. 'Income'
  4. 'Advertising'
  5. 'Population'
  6. 'Price'
  7. 'ShelveLoc'
  8. 'Age'
  9. 'Education'
  10. 'Urban'
  11. 'US'

In [97]:
# Look up a column's position by name, then index the names back out.
x <- "Price"
index <- match(x, colnames(Carseats))
colnames(Carseats)[index]


Out[97]:
'Price'

In [89]:
# Built-in iris data (base R datasets): regress sepal length on width.
data(iris)
class(iris)

irisfit <- lm(Sepal.Length ~ Sepal.Width, data = iris)
#summary(irisfit)
# extract the slope (2nd coefficient); coef() is the preferred accessor
coef(irisfit)[2]


Out[89]:
'data.frame'
Out[89]:
Sepal.Width: -0.2233610611299